In [258]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
Import the training data set from a pre-created CSV file
In [259]:
dataframe = pd.read_csv("data/auto_training_for_ipy.csv")
train_features = dataframe.iloc[:, 0:10]
train_labels = dataframe.iloc[:, 10:11].copy()  # copy to avoid a SettingWithCopyWarning below
# map the raw {-1, 1} labels onto {0, 1}
train_labels['label'] = train_labels['label'].astype(int)
train_labels['label'] = train_labels['label'].map({-1: 0, 1: 1})
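As a quick sanity check (a sketch, not part of the original notebook), the class balance after the remapping can be inspected:
In [ ]:
# sketch: counts of each class after mapping {-1, 1} -> {0, 1}
print(train_labels['label'].value_counts())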
In [260]:
print('features: ', train_features.shape)
print('labels : ', train_labels.shape)
In [261]:
train_features.head()
Out[261]:
In [262]:
train_features.describe()
Out[262]:
Distribution of the term_score feature
In [263]:
(train_features
.term_score
.plot
.line(lw=0.8))
plt.title('Term score')
plt.xlabel('ID')
Out[263]:
In [264]:
train_features.term_score.hist()
Out[264]:
In [265]:
print('mean: ', train_features.term_score.mean())
print('var : ', train_features.term_score.var())
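The same statistics can be printed for every column (a sketch; the remaining column names are whatever the CSV provides):
In [ ]:
# sketch: per-column mean and variance, while train_features is still a DataFrame
for col in train_features.columns:
    print('{}: mean={:.4f}, var={:.4f}'.format(
        col, train_features[col].mean(), train_features[col].var()))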
To get a better understanding of the problem domain, take a look at the correlation matrix
In [266]:
import seaborn as sns
correlations = train_features.corr()
corr_heat = sns.heatmap(correlations)
plt.title('Ranking feature correlations')
Out[266]:
In [267]:
(correlations
.term_score
.drop('term_score') # don't compare with myself
.sort_values(ascending=False)
.plot
.barh())
Out[267]:
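One possible follow-up (a sketch; the 0.1 cutoff is arbitrary and purely illustrative) is to keep only the features whose absolute correlation with term_score clears a threshold:
In [ ]:
# sketch: hypothetical feature pre-selection from the correlation matrix
strong = (correlations
          .term_score
          .drop('term_score')
          .abs())
print(strong[strong > 0.1].index.tolist())  # 0.1 is an illustrative cutoff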
Import the human-labelled testing set
In [268]:
dataframe = pd.read_csv("data/humanlabelled_for_ipy.csv")
test_features = dataframe.iloc[:, 0:10]
test_labels = dataframe.iloc[:, 10:11].copy()
# apply the same {-1, 1} -> {0, 1} mapping as for the training labels
test_labels['label'] = test_labels['label'].astype(int)
test_labels['label'] = test_labels['label'].map({-1: 0, 1: 1})
In [269]:
from sklearn import preprocessing
# Keras building blocks for the deep-learning model
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import plot_model
from keras.utils import np_utils
Data scaling
In [270]:
scaler = preprocessing.StandardScaler().fit(train_features)  # fit on the training set only
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)
# one-hot encode the {0, 1} labels for the two-unit softmax output
train_labels = np_utils.to_categorical(train_labels, num_classes=2)
test_labels = np_utils.to_categorical(test_labels, num_classes=2)
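A quick check (a sketch) that the scaling behaved as expected: StandardScaler should leave the training features with roughly zero mean and unit variance:
In [ ]:
# sketch: column-wise means should be ~0 and standard deviations ~1 after scaling
print('means ~ 0:', np.allclose(train_features.mean(axis=0), 0, atol=1e-6))
print('stds  ~ 1:', np.allclose(train_features.std(axis=0), 1, atol=1e-2))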
Build a sequential NN model
In [271]:
model = Sequential()
model.add(Dense(8, input_dim=10, activation='relu'))
model.add(Dense(6, activation='relu'))
model.add(Dense(4, activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',  # with a two-unit softmax, 'categorical_crossentropy' is the more conventional pairing
              metrics=['accuracy'])
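model.summary() prints the layer stack and parameter counts, which is handy to confirm the 10 → 8 → 6 → 4 → 2 architecture:
In [ ]:
# inspect layers and trainable-parameter counts of the compiled model
model.summary()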
Batch processing
In [272]:
# Disabled in favour of the online-learning loop below:
# train_loss_and_metrics = model.fit(train_features, train_labels, epochs=5, batch_size=32)
# print(train_loss_and_metrics)
# test_loss_and_metrics = model.evaluate(test_features, test_labels, batch_size=128)
# print(test_loss_and_metrics)
Online learning
In [ ]:
test_accuracy = []
test_loss = []
train_accuracy = []
train_loss = []
increment = 64
# split the training set into 64-sample chunks for incremental updates
chunks_train_data = [train_features[x:x+increment] for x in range(0, len(train_features), increment)]
chunks_train_labels = [train_labels[x:x+increment] for x in range(0, len(train_labels), increment)]
for epoch in range(5):
    for i, el in enumerate(chunks_train_data):
        print(i)
        # one gradient update on the current chunk
        train_loss_and_metrics = model.train_on_batch(el, chunks_train_labels[i])
        train_loss.append(train_loss_and_metrics[0])
        train_accuracy.append(train_loss_and_metrics[1])
        # evaluate on the human-labelled set after every update
        test_loss_and_metrics = model.evaluate(test_features, test_labels, batch_size=128)
        test_loss.append(test_loss_and_metrics[0])
        test_accuracy.append(test_loss_and_metrics[1])
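The same curves could also be collected with a single fit call (a sketch, kept commented out like the batch-processing cell above; History records per-epoch rather than per-batch metrics, so the curves would be coarser):
In [ ]:
# sketch: alternative to the manual loop above; metric key names
# ('acc' vs 'accuracy') vary across Keras versions
# history = model.fit(train_features, train_labels,
#                     epochs=5, batch_size=increment,
#                     validation_data=(test_features, test_labels))
# history.history holds the per-epoch loss/accuracy curves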
Visualize the learning curve
In [274]:
fig = plt.figure()
ax1 = fig.add_subplot(211)
ax1.plot(train_loss)
ax1.plot(test_loss)
ax1.set_ylabel('Loss')
ax1.legend(['training', 'testing'], loc='upper left')
ax2 = fig.add_subplot(212)
ax2.plot(train_accuracy)
ax2.plot(test_accuracy)
ax2.set_ylabel('Accuracy')
ax2.set_xlabel('Iteration')  # shared x-axis label belongs on the bottom subplot
ax2.legend(['training', 'testing'], loc='upper left')
Out[274]:
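Finally, a confusion matrix on the human-labelled set makes the per-class behaviour visible (a sketch, not part of the original run):
In [ ]:
from sklearn.metrics import confusion_matrix

# predict() returns a probability per class; argmax recovers the class index
pred = model.predict(test_features).argmax(axis=1)
true = test_labels.argmax(axis=1)
print(confusion_matrix(true, pred))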